home *** CD-ROM | disk | FTP | other *** search
/ PC World Komputer 2010 April / PCWorld0410.iso / hity wydania / Ubuntu 9.10 PL / karmelkowy-koliberek-9.10-netbook-remix-PL.iso / casper / filesystem.squashfs / usr / share / pyshared / BeautifulSoupTests.py < prev    next >
Text File  |  2009-01-06  |  34KB  |  827 lines

  1. # -*- coding: utf-8 -*-
  2. """Unit tests for Beautiful Soup.
  3.  
  4. These tests make sure the Beautiful Soup works as it should. If you
  5. find a bug in Beautiful Soup, the best way to express it is as a test
  6. case like this that fails."""
  7.  
  8. import unittest
  9. from BeautifulSoup import *
  10.  
  11. class SoupTest(unittest.TestCase):
  12.  
  13.     def assertSoupEquals(self, toParse, rep=None, c=BeautifulSoup,
  14.                          encoding=None):
  15.         """Parse the given text and make sure its string rep is the other
  16.         given text."""
  17.         if rep == None:
  18.             rep = toParse
  19.         obj = c(toParse)
  20.         if encoding is None:
  21.             rep2 = obj.decode()
  22.         else:
  23.             rep2 = obj.encode(encoding)
  24.         self.assertEqual(rep2, rep)
  25.  
  26. class FollowThatTag(SoupTest):
  27.  
  28.     "Tests the various ways of fetching tags from a soup."
  29.  
  30.     def setUp(self):
  31.         ml = """
  32.         <a id="x">1</a>
  33.         <A id="a">2</a>
  34.         <b id="b">3</a>
  35.         <b href="foo" id="x">4</a>
  36.         <ac width=100>4</ac>"""
  37.         self.soup = BeautifulStoneSoup(ml)
  38.  
  39.     def testFindAllByName(self):
  40.         matching = self.soup('a')
  41.         self.assertEqual(len(matching), 2)
  42.         self.assertEqual(matching[0].name, 'a')
  43.         self.assertEqual(matching, self.soup.findAll('a'))
  44.         self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))
  45.  
  46.     def testFindAllByAttribute(self):
  47.         matching = self.soup.findAll(id='x')
  48.         self.assertEqual(len(matching), 2)
  49.         self.assertEqual(matching[0].name, 'a')
  50.         self.assertEqual(matching[1].name, 'b')
  51.  
  52.         matching2 = self.soup.findAll(attrs={'id' : 'x'})
  53.         self.assertEqual(matching, matching2)
  54.  
  55.         strainer = SoupStrainer(attrs={'id' : 'x'})
  56.         self.assertEqual(matching, self.soup.findAll(strainer))
  57.  
  58.         self.assertEqual(len(self.soup.findAll(id=None)), 1)
  59.  
  60.         self.assertEqual(len(self.soup.findAll(width=100)), 1)
  61.         self.assertEqual(len(self.soup.findAll(junk=None)), 5)
  62.         self.assertEqual(len(self.soup.findAll(junk=[1, None])), 5)
  63.  
  64.         self.assertEqual(len(self.soup.findAll(junk=re.compile('.*'))), 0)
  65.         self.assertEqual(len(self.soup.findAll(junk=True)), 0)
  66.  
  67.         self.assertEqual(len(self.soup.findAll(junk=True)), 0)
  68.         self.assertEqual(len(self.soup.findAll(href=True)), 1)
  69.  
  70.     def testFindallByClass(self):
  71.         soup = BeautifulSoup('<a>Foo</a><a class="1">Bar</a>')
  72.         self.assertEqual(soup.find('a', '1').string, "Bar")
  73.  
  74.     def testFindAllByList(self):
  75.         matching = self.soup(['a', 'ac'])
  76.         self.assertEqual(len(matching), 3)
  77.  
  78.     def testFindAllByHash(self):
  79.         matching = self.soup({'a' : True, 'b' : True})
  80.         self.assertEqual(len(matching), 4)
  81.  
  82.     def testFindAllText(self):
  83.         soup = BeautifulSoup("<html>\xbb</html>")
  84.         self.assertEqual(soup.findAll(text=re.compile('.*')),
  85.                          [u'\xbb'])
  86.  
  87.     def testFindAllByRE(self):
  88.         import re
  89.         r = re.compile('a.*')
  90.         self.assertEqual(len(self.soup(r)), 3)
  91.  
  92.     def testFindAllByMethod(self):
  93.         def matchTagWhereIDMatchesName(tag):
  94.             return tag.name == tag.get('id')
  95.  
  96.         matching = self.soup.findAll(matchTagWhereIDMatchesName)
  97.         self.assertEqual(len(matching), 2)
  98.         self.assertEqual(matching[0].name, 'a')
  99.  
  100.     def testParents(self):
  101.         soup = BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah')
  102.         b = soup.b
  103.         self.assertEquals(len(b.findParents('ul', {'id' : 'foo'})), 2)
  104.         self.assertEquals(b.findParent('ul')['a'], 'b')
  105.  
  106.     PROXIMITY_TEST = BeautifulSoup('<b id="1"><b id="2"><b id="3"><b id="4">')
  107.  
  108.     def testNext(self):
  109.         soup = self.PROXIMITY_TEST
  110.         b = soup.find('b', {'id' : 2})
  111.         self.assertEquals(b.findNext('b')['id'], '3')
  112.         self.assertEquals(b.findNext('b')['id'], '3')
  113.         self.assertEquals(len(b.findAllNext('b')), 2)
  114.         self.assertEquals(len(b.findAllNext('b', {'id' : 4})), 1)
  115.  
  116.     def testPrevious(self):
  117.         soup = self.PROXIMITY_TEST
  118.         b = soup.find('b', {'id' : 3})
  119.         self.assertEquals(b.findPrevious('b')['id'], '2')
  120.         self.assertEquals(b.findPrevious('b')['id'], '2')
  121.         self.assertEquals(len(b.findAllPrevious('b')), 2)
  122.         self.assertEquals(len(b.findAllPrevious('b', {'id' : 2})), 1)
  123.  
  124.  
  125.     SIBLING_TEST = BeautifulSoup('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')
  126.  
  127.     def testNextSibling(self):
  128.         soup = self.SIBLING_TEST
  129.         tag = 'blockquote'
  130.         b = soup.find(tag, {'id' : 2})
  131.         self.assertEquals(b.findNext(tag)['id'], '2.1')
  132.         self.assertEquals(b.findNextSibling(tag)['id'], '3')
  133.         self.assertEquals(b.findNextSibling(tag)['id'], '3')
  134.         self.assertEquals(len(b.findNextSiblings(tag)), 2)
  135.         self.assertEquals(len(b.findNextSiblings(tag, {'id' : 4})), 1)
  136.  
  137.     def testPreviousSibling(self):
  138.         soup = self.SIBLING_TEST
  139.         tag = 'blockquote'
  140.         b = soup.find(tag, {'id' : 3})
  141.         self.assertEquals(b.findPrevious(tag)['id'], '2.1')
  142.         self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
  143.         self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
  144.         self.assertEquals(len(b.findPreviousSiblings(tag)), 2)
  145.         self.assertEquals(len(b.findPreviousSiblings(tag, id=1)), 1)
  146.  
  147.     def testTextNavigation(self):
  148.         soup = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
  149.         baz = soup.find(text='Baz')
  150.         self.assertEquals(baz.findParent("i")['id'], '1')
  151.         self.assertEquals(baz.findNext(text='Blee'), 'Blee')
  152.         self.assertEquals(baz.findNextSibling(text='Blee'), 'Blee')
  153.         self.assertEquals(baz.findNextSibling(text='Blargh'), None)
  154.         self.assertEquals(baz.findNextSibling('hr')['id'], '1')
  155.  
  156. class SiblingRivalry(SoupTest):
  157.     "Tests the nextSibling and previousSibling navigation."
  158.  
  159.     def testSiblings(self):
  160.         soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
  161.         secondLI = soup.find('li').nextSibling
  162.         self.assert_(secondLI.name == 'li' and secondLI.string == '2')
  163.         self.assertEquals(soup.find(text='1').nextSibling.name, 'p')
  164.         self.assertEquals(soup.find('p').nextSibling, 'B')
  165.         self.assertEquals(soup.find('p').nextSibling.previousSibling.nextSibling, 'B')
  166.  
  167. class TagsAreObjectsToo(SoupTest):
  168.     "Tests the various built-in functions of Tag objects."
  169.  
  170.     def testLen(self):
  171.         soup = BeautifulSoup("<top>1<b>2</b>3</top>")
  172.         self.assertEquals(len(soup.top), 3)
  173.  
  174. class StringEmUp(SoupTest):
  175.     "Tests the use of 'string' as an alias for a tag's only content."
  176.  
  177.     def testString(self):
  178.         s = BeautifulSoup("<b>foo</b>")
  179.         self.assertEquals(s.b.string, 'foo')
  180.  
  181.     def testLackOfString(self):
  182.         s = BeautifulSoup("<b>f<i>e</i>o</b>")
  183.         self.assert_(not s.b.string)
  184.  
  185. class ThatsMyLimit(SoupTest):
  186.     "Tests the limit argument."
  187.  
  188.     def testBasicLimits(self):
  189.         s = BeautifulSoup('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
  190.         self.assertEquals(len(s.findAll('br')), 4)
  191.         self.assertEquals(len(s.findAll('br', limit=2)), 2)
  192.         self.assertEquals(len(s('br', limit=2)), 2)
  193.  
  194. class OnlyTheLonely(SoupTest):
  195.     "Tests the parseOnly argument to the constructor."
  196.     def setUp(self):
  197.         x = []
  198.         for i in range(1,6):
  199.             x.append('<a id="%s">' % i)
  200.             for j in range(100,103):
  201.                 x.append('<b id="%s.%s">Content %s.%s</b>' % (i,j, i,j))
  202.             x.append('</a>')
  203.         self.x = ''.join(x)
  204.  
  205.     def testOnly(self):
  206.         strainer = SoupStrainer("b")
  207.         soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
  208.         self.assertEquals(len(soup), 15)
  209.  
  210.         strainer = SoupStrainer(id=re.compile("100.*"))
  211.         soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
  212.         self.assertEquals(len(soup), 5)
  213.  
  214.         strainer = SoupStrainer(text=re.compile("10[01].*"))
  215.         soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
  216.         self.assertEquals(len(soup), 10)
  217.  
  218.         strainer = SoupStrainer(text=lambda(x):x[8]=='3')
  219.         soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
  220.         self.assertEquals(len(soup), 3)
  221.  
  222. class PickleMeThis(SoupTest):
  223.     "Testing features like pickle and deepcopy."
  224.  
  225.     def setUp(self):
  226.         self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
  227. "http://www.w3.org/TR/REC-html40/transitional.dtd">
  228. <html>
  229. <head>
  230. <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  231. <title>Beautiful Soup: We called him Tortoise because he taught us.</title>
  232. <link rev="made" href="mailto:leonardr@segfault.org">
  233. <meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
  234. <meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
  235. <meta name="author" content="Leonard Richardson">
  236. </head>
  237. <body>
  238. <a href="foo">foo</a>
  239. <a href="foo"><b>bar</b></a>
  240. </body>
  241. </html>"""
  242.  
  243.         self.soup = BeautifulSoup(self.page)
  244.  
  245.     def testPickle(self):
  246.         import pickle
  247.         dumped = pickle.dumps(self.soup, 2)
  248.         loaded = pickle.loads(dumped)
  249.         self.assertEqual(loaded.__class__, BeautifulSoup)
  250.         self.assertEqual(loaded.decode(), self.soup.decode())
  251.  
  252.     def testDeepcopy(self):
  253.         from copy import deepcopy
  254.         deepcopy(BeautifulSoup("<a></a>"))
  255.         copied = deepcopy(self.soup)
  256.         self.assertEqual(copied.decode(), self.soup.decode())
  257.  
  258.     def testUnicodePickle(self):
  259.         import cPickle as pickle
  260.         html = "<b>" + chr(0xc3) + "</b>"
  261.         soup = BeautifulSoup(html)
  262.         dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
  263.         loaded = pickle.loads(dumped)
  264.         self.assertEqual(loaded.decode(), soup.decode())
  265.  
  266.  
class WriteOnlyCode(SoupTest):
    "Testing the modification of the tree."

    def testModifyAttributes(self):
        # Set, delete and add attributes through item access, checking
        # the rendering after each step.
        soup = BeautifulSoup('<a id="1"></a>')
        soup.a['id'] = 2
        self.assertEqual(soup.decode(), '<a id="2"></a>')
        del(soup.a['id'])
        self.assertEqual(soup.decode(), '<a></a>')
        soup.a['id2'] = 'foo'
        self.assertEqual(soup.decode(), '<a id2="foo"></a>')

    def testNewTagCreation(self):
        "Makes sure tags don't step on each others' toes."
        soup = BeautifulSoup()
        a = Tag(soup, 'a')
        ol = Tag(soup, 'ol')
        a['href'] = 'http://foo.com/'
        # Setting an attribute on one new tag must not leak to another.
        self.assertRaises(KeyError, lambda : ol['href'])

    def testTagReplacement(self):
        # Make sure you can replace an element with itself.
        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = BeautifulSoup(text)
        c = soup.c
        soup.c.replaceWith(c)
        self.assertEquals(soup.decode(), text)

        # A very simple case
        soup = BeautifulSoup("<b>Argh!</b>")
        soup.find(text="Argh!").replaceWith("Hooray!")
        newText = soup.find(text="Hooray!")
        b = soup.b
        # The replacement must be linked into the parse-order chain
        # (previous/next) as well as the parent relationship.
        self.assertEqual(newText.previous, b)
        self.assertEqual(newText.parent, b)
        self.assertEqual(newText.previous.next, newText)
        self.assertEqual(newText.next, None)

        # A more complex case
        soup = BeautifulSoup("<a><b>Argh!</b><c></c><d></d></a>")
        soup.b.insert(1, "Hooray!")
        newText = soup.find(text="Hooray!")
        self.assertEqual(newText.previous, "Argh!")
        self.assertEqual(newText.previous.next, newText)

        self.assertEqual(newText.previousSibling, "Argh!")
        self.assertEqual(newText.previousSibling.nextSibling, newText)

        self.assertEqual(newText.nextSibling, None)
        self.assertEqual(newText.next, soup.c)

        # Replace one tag with another tag taken from the same tree.
        text = "<html>There's <b>no</b> business like <b>show</b> business</html>"
        soup = BeautifulSoup(text)
        no, show = soup.findAll('b')
        show.replaceWith(no)
        self.assertEquals(soup.decode(), "<html>There's  business like <b>no</b> business</html>")

        # Even more complex
        soup = BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>")
        tag = Tag(soup, 'magictag')
        tag.insert(0, "the")
        soup.a.insert(1, tag)

        b = soup.b
        c = soup.c
        theText = tag.find(text=True)
        findText = b.find(text="Find")

        # The new tag sits between <b> and <c> in both sibling order and
        # parse order.
        self.assertEqual(findText.next, tag)
        self.assertEqual(tag.previous, findText)
        self.assertEqual(b.nextSibling, tag)
        self.assertEqual(tag.previousSibling, b)
        self.assertEqual(tag.nextSibling, c)
        self.assertEqual(c.previousSibling, tag)

        self.assertEqual(theText.next, c)
        self.assertEqual(c.previous, theText)

        # Aand... incredibly complex.
        soup = BeautifulSoup("""<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
        f = soup.f
        a = soup.a
        # NOTE(review): `c` and `e` are not referenced again in this test.
        c = soup.c
        e = soup.e
        weText = a.find(text="We")
        # Replace a subtree with a tag pulled from a different branch.
        soup.b.replaceWith(soup.f)
        self.assertEqual(soup.decode(), "<a>We<f>refuse</f></a><e>to<g>service</g></e>")

        self.assertEqual(f.previous, weText)
        self.assertEqual(weText.next, f)
        self.assertEqual(f.previousSibling, weText)
        self.assertEqual(f.nextSibling, None)
        self.assertEqual(weText.nextSibling, f)

    def testAppend(self):
       # Appending a tag that already lives elsewhere in the tree is
       # expected to move it rather than copy it.
       doc = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>"
       soup = BeautifulSoup(doc)
       second_para = soup('p')[1]
       bold = soup.find('b')
       soup('p')[1].append(soup.find('b'))
       self.assertEqual(bold.parent, second_para)
       self.assertEqual(soup.decode(),
                        "<p>Don't leave me .</p> "
                        "<p>Don't leave me.<b>here</b></p>")

    def testTagExtraction(self):
        # A very simple case
        text = '<html><div id="nav">Nav crap</div>Real content here.</html>'
        soup = BeautifulSoup(text)
        extracted = soup.find("div", id="nav").extract()
        self.assertEqual(soup.decode(), "<html>Real content here.</html>")
        self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')

        # A simple case, a more complex test.
        text = "<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>"
        soup = BeautifulStoneSoup(text)
        doc = soup.doc
        numbers, roman, letters = soup("a")

        # Record the links around the middle <a> before extracting it.
        self.assertEqual(roman.parent, doc)
        oldPrevious = roman.previous
        endOfThisTag = roman.nextSibling.previous
        self.assertEqual(oldPrevious, "2")
        self.assertEqual(roman.next, "i")
        self.assertEqual(endOfThisTag, "ii")
        self.assertEqual(roman.previousSibling, numbers)
        self.assertEqual(roman.nextSibling, letters)

        # After extraction the subtree keeps its internal links but is
        # cut off from the document, whose neighbours are joined up.
        roman.extract()
        self.assertEqual(roman.parent, None)
        self.assertEqual(roman.previous, None)
        self.assertEqual(roman.next, "i")
        self.assertEqual(letters.previous, '2')
        self.assertEqual(roman.previousSibling, None)
        self.assertEqual(roman.nextSibling, None)
        self.assertEqual(endOfThisTag.next, None)
        self.assertEqual(roman.b.contents[0].next, None)
        self.assertEqual(numbers.nextSibling, letters)
        self.assertEqual(letters.previousSibling, numbers)
        self.assertEqual(len(doc.contents), 2)
        self.assertEqual(doc.contents[0], numbers)
        self.assertEqual(doc.contents[1], letters)

        # A more complex case.
        text = "<a>1<b>2<c>Hollywood, baby!</c></b></a>3"
        soup = BeautifulStoneSoup(text)
        one = soup.find(text="1")
        three = soup.find(text="3")
        # NOTE(review): `toExtract` is not referenced again in this test.
        toExtract = soup.b
        soup.b.extract()
        self.assertEqual(one.next, three)
        self.assertEqual(three.previous, one)
        self.assertEqual(one.parent.nextSibling, three)
        self.assertEqual(three.previousSibling, soup.a)
  421.  
  422. class TheManWithoutAttributes(SoupTest):
  423.     "Test attribute access"
  424.  
  425.     def testHasKey(self):
  426.         text = "<foo attr='bar'>"
  427.         self.assertTrue(BeautifulSoup(text).foo.has_key('attr'))
  428.  
  429. class QuoteMeOnThat(SoupTest):
  430.     "Test quoting"
  431.     def testQuotedAttributeValues(self):
  432.         self.assertSoupEquals("<foo attr='bar'></foo>",
  433.                               '<foo attr="bar"></foo>')
  434.  
  435.         text = """<foo attr='bar "brawls" happen'>a</foo>"""
  436.         soup = BeautifulSoup(text)
  437.         self.assertEquals(soup.decode(), text)
  438.  
  439.         soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
  440.         newText = """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>"""
  441.         self.assertSoupEquals(soup.decode(), newText)
  442.  
  443.         self.assertSoupEquals('<this is="really messed up & stuff">',
  444.                               '<this is="really messed up & stuff"></this>')
  445.  
  446.  
  447.  
  448. class YoureSoLiteral(SoupTest):
  449.     "Test literal mode."
  450.     def testLiteralMode(self):
  451.         text = "<script>if (i<imgs.length)</script><b>Foo</b>"
  452.         soup = BeautifulSoup(text)
  453.         self.assertEqual(soup.script.contents[0], "if (i<imgs.length)")
  454.         self.assertEqual(soup.b.contents[0], "Foo")
  455.  
  456.     def testTextArea(self):
  457.         text = "<textarea><b>This is an example of an HTML tag</b><&<&</textarea>"
  458.         soup = BeautifulSoup(text)
  459.         self.assertEqual(soup.textarea.contents[0],
  460.                          "<b>This is an example of an HTML tag</b><&<&")
  461.  
  462. class OperatorOverload(SoupTest):
  463.     "Our operators do it all! Call now!"
  464.  
  465.     def testTagNameAsFind(self):
  466.         "Tests that referencing a tag name as a member delegates to find()."
  467.         soup = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
  468.         self.assertEqual(soup.b.i, soup.find('b').find('i'))
  469.         self.assertEqual(soup.b.i.string, 'bar')
  470.         self.assertEqual(soup.b['id'], '1')
  471.         self.assertEqual(soup.b.contents[0], 'foo')
  472.         self.assert_(not soup.a)
  473.  
  474.         #Test the .fooTag variant of .foo.
  475.         self.assertEqual(soup.bTag.iTag.string, 'bar')
  476.         self.assertEqual(soup.b.iTag.string, 'bar')
  477.         self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag)
  478.  
  479. class NestableEgg(SoupTest):
  480.     """Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!"""
  481.  
  482.     def testParaInsideBlockquote(self):
  483.         soup = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
  484.         self.assertEqual(soup.blockquote.p.b.string, 'Foo')
  485.         self.assertEqual(soup.blockquote.b.string, 'Foo')
  486.         self.assertEqual(soup.find('p', recursive=False).string, 'Bar')
  487.  
  488.     def testNestedTables(self):
  489.         text = """<table id="1"><tr><td>Here's another table:
  490.         <table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>"""
  491.         soup = BeautifulSoup(text)
  492.         self.assertEquals(soup.table.table.td.string, 'Juicy text')
  493.         self.assertEquals(len(soup.findAll('table')), 2)
  494.         self.assertEquals(len(soup.table.findAll('table')), 1)
  495.         self.assertEquals(soup.find('table', {'id' : 2}).parent.parent.parent.name,
  496.                           'table')
  497.  
  498.         text = "<table><tr><td><div><table>Foo</table></div></td></tr></table>"
  499.         soup = BeautifulSoup(text)
  500.         self.assertEquals(soup.table.tr.td.div.table.contents[0], "Foo")
  501.  
  502.         text = """<table><thead><tr>Foo</tr></thead><tbody><tr>Bar</tr></tbody>
  503.         <tfoot><tr>Baz</tr></tfoot></table>"""
  504.         soup = BeautifulSoup(text)
  505.         self.assertEquals(soup.table.thead.tr.contents[0], "Foo")
  506.  
  507.     def testBadNestedTables(self):
  508.         soup = BeautifulSoup("<table><tr><table><tr id='nested'>")
  509.         self.assertEquals(soup.table.tr.table.tr['id'], 'nested')
  510.  
class CleanupOnAisleFour(SoupTest):
    """Here we test cleanup of text that breaks HTMLParser or is just
    obnoxious."""

    # NOTE(review): several literals in this class contain raw '<', '>',
    # '&' and accented characters exactly where entity handling is being
    # exercised. Presumably entity references were decoded by whatever
    # produced this copy of the file -- verify against the upstream test
    # suite before trusting these expectations.

    def testSelfClosingtag(self):
        # <br/> is rendered with a space before the slash.
        self.assertEqual(BeautifulSoup("Foo<br/>Bar").find('br').decode(),
                         '<br />')

        self.assertSoupEquals('<p>test1<br/>test2</p>',
                              '<p>test1<br />test2</p>')

        # BeautifulStoneSoup treats an unknown tag as a container unless
        # it is named in selfClosingTags.
        text = '<p>test1<selfclosing>test2'
        soup = BeautifulStoneSoup(text)
        self.assertEqual(soup.decode(),
                         '<p>test1<selfclosing>test2</selfclosing></p>')

        soup = BeautifulStoneSoup(text, selfClosingTags='selfclosing')
        self.assertEqual(soup.decode(),
                         '<p>test1<selfclosing />test2</p>')

    def testSelfClosingTagOrNot(self):
        # The same <link> markup is a container for BeautifulStoneSoup and
        # self-closing for BeautifulSoup.
        text = "<item><link>http://foo.com/</link></item>"
        self.assertEqual(BeautifulStoneSoup(text).decode(), text)
        self.assertEqual(BeautifulSoup(text).decode(),
                         '<item><link />http://foo.com/</item>')

    def testBooleanAttributes(self):
        # A valueless attribute must round-trip unchanged.
        text = "<td nowrap>foo</td>"
        self.assertSoupEquals(text, text)

    def testCData(self):
        xml = "<root>foo<![CDATA[foobar]]>bar</root>"
        self.assertSoupEquals(xml, xml)
        r = re.compile("foo.*bar")
        soup = BeautifulSoup(xml)
        self.assertEquals(soup.find(text=r).string, "foobar")
        self.assertEquals(soup.find(text=r).__class__, CData)

    def testComments(self):
        xml = "foo<!--foobar-->baz"
        self.assertSoupEquals(xml)
        r = re.compile("foo.*bar")
        soup = BeautifulSoup(xml)
        self.assertEquals(soup.find(text=r).string, "foobar")
        self.assertEquals(soup.find(text="foobar").__class__, Comment)

    def testDeclaration(self):
        xml = "foo<!DOCTYPE foobar>baz"
        self.assertSoupEquals(xml)
        r = re.compile(".*foo.*bar")
        soup = BeautifulSoup(xml)
        text = "DOCTYPE foobar"
        self.assertEquals(soup.find(text=r).string, text)
        self.assertEquals(soup.find(text=text).__class__, Declaration)

        # A doctype whose name contains a namespace-style colon.
        namespaced_doctype = ('<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">'
                              '<html>foo</html>')
        soup = BeautifulSoup(namespaced_doctype)
        self.assertEquals(soup.contents[0],
                          'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')
        self.assertEquals(soup.html.contents[0], 'foo')

    def testEntityConversions(self):
        # NOTE(review): `text` holds no entity references in this copy, so
        # the convertEntities settings below have nothing to convert --
        # confirm the intended literals.
        text = "<<sacré bleu!>>"
        # NOTE(review): this `soup` is overwritten before it is read.
        soup = BeautifulStoneSoup(text)
        self.assertSoupEquals(text)

        xmlEnt = BeautifulStoneSoup.XML_ENTITIES
        htmlEnt = BeautifulStoneSoup.HTML_ENTITIES
        xhtmlEnt = BeautifulStoneSoup.XHTML_ENTITIES

        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEquals(soup.decode(), "<<sacré bleu!>>")

        # NOTE(review): identical to the check directly above.
        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEquals(soup.decode(), "<<sacré bleu!>>")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEquals(soup.decode(), u"<<sacr\xe9 bleu!>>")

        # Make sure the "XML", "HTML", and "XHTML" settings work.
        text = "<™'"
        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEquals(soup.decode(), u"<™'")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEquals(soup.decode(), u"<\u2122'")

        soup = BeautifulStoneSoup(text, convertEntities=xhtmlEnt)
        self.assertEquals(soup.decode(), u"<\u2122'")

    def testNonBreakingSpaces(self):
        # NOTE(review): the expectation is two U+00A0 characters, but the
        # input as shown contains no entity reference -- confirm.
        soup = BeautifulSoup("<a>  </a>",
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEquals(soup.decode(), u"<a>\xa0\xa0</a>")

    def testWhitespaceInDeclaration(self):
        # Whitespace after "<!" is dropped on output.
        self.assertSoupEquals('<! DOCTYPE>', '<!DOCTYPE>')

    def testJunkInDeclaration(self):
        self.assertSoupEquals('<! Foo = -8>a', '<!Foo = -8>a')

    def testIncompleteDeclaration(self):
        # An unterminated declaration must round-trip unchanged.
        self.assertSoupEquals('a<!b <p>c')

    def testEntityReplacement(self):
        # NOTE(review): no entity appears in this literal despite the
        # test name -- confirm.
        self.assertSoupEquals('<b>hello there</b>')

    def testEntitiesInAttributeValues(self):
        # NOTE(review): the two calls below are identical in this copy.
        self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>',
                              encoding='utf-8')
        self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>',
                              encoding='utf-8')

        soup = BeautifulSoup('<x t=">™">',
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEquals(soup.decode(), u'<x t=">\u2122"></x>')

        uri = "http://crummy.com?sacré&bleu"
        link = '<a href="%s"></a>' % uri

        # First check the whole rendered link...
        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEquals(soup.decode(),
                          link.replace("é", u"\xe9"))

        # ...then the attribute value on its own.
        uri = "http://crummy.com?sacré&bleu"
        link = '<a href="%s"></a>' % uri
        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEquals(soup.a['href'],
                          uri.replace("é", u"\xe9"))

    def testNakedAmpersands(self):
        html = {'convertEntities':BeautifulStoneSoup.HTML_ENTITIES}
        soup = BeautifulStoneSoup("AT&T ", **html)
        self.assertEquals(soup.decode(), 'AT&T ')

        nakedAmpersandInASentence = "AT&T was Ma Bell"
        soup = BeautifulStoneSoup(nakedAmpersandInASentence,**html)
        # NOTE(review): replace('&','&') is a no-op as written; the second
        # argument was presumably an entity reference -- confirm.
        self.assertEquals(soup.decode(), \
               nakedAmpersandInASentence.replace('&','&'))

        invalidURL = '<a href="http://example.org?a=1&b=2;3">foo</a>'
        # NOTE(review): same no-op replace, so validURL equals invalidURL.
        validURL = invalidURL.replace('&','&')
        soup = BeautifulStoneSoup(invalidURL)
        self.assertEquals(soup.decode(), validURL)

        soup = BeautifulStoneSoup(validURL)
        self.assertEquals(soup.decode(), validURL)
  659.  
  660.  
  661. class EncodeRed(SoupTest):
  662.     """Tests encoding conversion, Unicode conversion, and Microsoft
  663.     smart quote fixes."""
  664.  
  665.     def testUnicodeDammitStandalone(self):
  666.         markup = "<foo>\x92</foo>"
  667.         dammit = UnicodeDammit(markup)
  668.         self.assertEquals(dammit.unicode, "<foo>’</foo>")
  669.  
  670.         hebrew = "\xed\xe5\xec\xf9"
  671.         dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
  672.         self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
  673.         self.assertEquals(dammit.originalEncoding, 'iso-8859-8')
  674.  
  675.     def testGarbageInGarbageOut(self):
  676.         ascii = "<foo>a</foo>"
  677.         asciiSoup = BeautifulStoneSoup(ascii)
  678.         self.assertEquals(ascii, asciiSoup.decode())
  679.  
  680.         unicodeData = u"<foo>\u00FC</foo>"
  681.         utf8 = unicodeData.encode("utf-8")
  682.         self.assertEquals(utf8, '<foo>\xc3\xbc</foo>')
  683.  
  684.         unicodeSoup = BeautifulStoneSoup(unicodeData)
  685.         self.assertEquals(unicodeData, unicodeSoup.decode())
  686.         self.assertEquals(unicodeSoup.foo.string, u'\u00FC')
  687.  
  688.         utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8')
  689.         self.assertEquals(utf8, utf8Soup.encode('utf-8'))
  690.         self.assertEquals(utf8Soup.originalEncoding, "utf-8")
  691.  
  692.         utf8Soup = BeautifulStoneSoup(unicodeData)
  693.         self.assertEquals(utf8, utf8Soup.encode('utf-8'))
  694.         self.assertEquals(utf8Soup.originalEncoding, None)
  695.  
  696.  
  697.     def testHandleInvalidCodec(self):
  698.         for bad_encoding in ['.utf8', '...', 'utF---16.!']:
  699.             soup = BeautifulSoup(u"R├ñksm├╢rg├Ñs".encode("utf-8"),
  700.                                  fromEncoding=bad_encoding)
  701.             self.assertEquals(soup.originalEncoding, 'utf-8')
  702.  
  703.     def testUnicodeSearch(self):
  704.         html = u'<html><body><h1>R├ñksm├╢rg├Ñs</h1></body></html>'
  705.         soup = BeautifulSoup(html)
  706.         self.assertEqual(soup.find(text=u'R├ñksm├╢rg├Ñs'),u'R├ñksm├╢rg├Ñs')
  707.  
  708.     def testRewrittenXMLHeader(self):
  709.         euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
  710.         utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
  711.         soup = BeautifulStoneSoup(euc_jp)
  712.         if soup.originalEncoding != "euc-jp":
  713.             raise Exception("Test failed when parsing euc-jp document. "
  714.                             "If you're running Python >=2.4, or you have "
  715.                             "cjkcodecs installed, this is a real problem. "
  716.                             "Otherwise, ignore it.")
  717.  
  718.         self.assertEquals(soup.originalEncoding, "euc-jp")
  719.         self.assertEquals(soup.renderContents('utf-8'), utf8)
  720.  
  721.         old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
  722.         new_text = "<?xml version='1.0' encoding='utf-8'?><foo>’</foo>"
  723.         self.assertSoupEquals(old_text, new_text)
  724.  
  725.     def testRewrittenMetaTag(self):
  726.         no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
  727.         soup = BeautifulSoup(no_shift_jis_html)
  728.  
  729.         # Beautiful Soup used to try to rewrite the meta tag even if the
  730.         # meta tag got filtered out by the strainer. This test makes
  731.         # sure that doesn't happen.
  732.         strainer = SoupStrainer('pre')
  733.         soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer)
  734.         self.assertEquals(soup.contents[0].name, 'pre')
  735.  
  736.         meta_tag = ('<meta content="text/html; charset=x-sjis" '
  737.                     'http-equiv="Content-type" />')
  738.         shift_jis_html = (
  739.             '<html><head>\n%s\n'
  740.             '<meta http-equiv="Content-language" content="ja" />'
  741.             '</head><body><pre>\n'
  742.             '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
  743.             '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
  744.             '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
  745.             '</pre></body></html>') % meta_tag
  746.         soup = BeautifulSoup(shift_jis_html)
  747.         if soup.originalEncoding != "shift-jis":
  748.             raise Exception("Test failed when parsing shift-jis document "
  749.                             "with meta tag '%s'."
  750.                             "If you're running Python >=2.4, or you have "
  751.                             "cjkcodecs installed, this is a real problem. "
  752.                             "Otherwise, ignore it." % meta_tag)
  753.         self.assertEquals(soup.originalEncoding, "shift-jis")
  754.  
  755.         content_type_tag = soup.meta['content']
  756.         self.assertEquals(content_type_tag[content_type_tag.find('charset='):],
  757.                           'charset=%SOUP-ENCODING%')
  758.         content_type = str(soup.meta)
  759.         index = content_type.find('charset=')
  760.         self.assertEqual(content_type[index:index+len('charset=utf8')+1],
  761.                          'charset=utf-8')
  762.         content_type = soup.meta.encode('shift-jis')
  763.         index = content_type.find('charset=')
  764.         self.assertEqual(content_type[index:index+len('charset=shift-jis')],
  765.                          'charset=shift-jis'.encode())
  766.  
  767.         self.assertEquals(soup.encode('utf-8'), (
  768.                 '<html><head>\n'
  769.                 '<meta content="text/html; charset=utf-8" '
  770.                 'http-equiv="Content-type" />\n'
  771.                 '<meta http-equiv="Content-language" content="ja" />'
  772.                 '</head><body><pre>\n'
  773.                 '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
  774.                 '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
  775.                 '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
  776.                 '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
  777.                 '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
  778.                 '</pre></body></html>'))
  779.         self.assertEquals(soup.encode("shift-jis"),
  780.                           shift_jis_html.replace('x-sjis'.encode(),
  781.                                                  'shift-jis'.encode()))
  782.  
  783.         isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
  784.         soup = BeautifulSoup(isolatin)
  785.  
  786.         utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
  787.         utf8 = utf8.replace("\xe9", "\xc3\xa9")
  788.         self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8')
  789.  
  790.     def testHebrew(self):
  791.         iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
  792.         utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
  793.         soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8")
  794.         self.assertEquals(soup.encode('utf-8'), utf8)
  795.  
  796.     def testSmartQuotesNotSoSmartAnymore(self):
  797.         self.assertSoupEquals("\x91Foo\x92 <!--blah-->",
  798.                               '‘Foo’ <!--blah-->')
  799.  
  800.     def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
  801.         smartQuotes = "Il a dit, \x8BSacré bleu!\x9b"
  802.         soup = BeautifulSoup(smartQuotes)
  803.         self.assertEquals(soup.decode(),
  804.                           'Il a dit, ‹Sacré bleu!›')
  805.         soup = BeautifulSoup(smartQuotes, convertEntities="html")
  806.         self.assertEquals(soup.encode('utf-8'),
  807.                           'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')
  808.  
  809.     def testDontSeeSmartQuotesWhereThereAreNone(self):
  810.         utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
  811.         self.assertSoupEquals(utf_8, encoding='utf-8')
  812.  
  813.  
  814. class Whitewash(SoupTest):
  815.     """Test whitespace preservation."""
  816.  
  817.     def testPreservedWhitespace(self):
  818.         self.assertSoupEquals("<pre>   </pre>")
  819.         self.assertSoupEquals("<pre> woo  </pre>")
  820.  
  821.     def testCollapsedWhitespace(self):
  822.         self.assertSoupEquals("<p>   </p>", "<p> </p>")
  823.  
  824.  
  825. if __name__ == '__main__':
  826.     unittest.main()
  827.